In [34]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from itertools import combinations
from sklearn.metrics import explained_variance_score, r2_score
from sklearn import linear_model
from sklearn import svm

# Candidate regressors. All of them are instantiated with the settings below,
# but only the ones placed in `models` take part in the comparison loop that
# is visible in this notebook.
reg = linear_model.LinearRegression()
reg1 = linear_model.BayesianRidge()
reg2 = linear_model.RANSACRegressor()
reg3 = linear_model.LassoLars(alpha=0.1)
reg4 = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
reg5 = linear_model.ElasticNetCV()
reg6 = linear_model.SGDRegressor()
reg7 = linear_model.PassiveAggressiveRegressor()
reg8 = linear_model.TheilSenRegressor()
reg9 = linear_model.HuberRegressor()
clf = svm.SVR(C=1.0, epsilon=0.2)

# Estimators that are actually compared, and their class names in the same
# order (used as tick labels on the score plot).
models = [clf, reg, reg1, reg3, reg5]
models_desc = [type(estimator).__name__ for estimator in models]
In [35]:
# Load the raw crime table. Cells holding the literal '?' are treated as
# missing and turned into NaN so pandas/scikit-learn can handle them.
df = pd.read_csv('../datasets/UnnormalizedCrimeData.csv')
# `np.nan` is the canonical spelling and works on every NumPy version;
# upper-case aliases such as `np.NaN` were removed in NumPy 2.0.
df = df.replace('?', np.nan)

# Columns that are used, one at a time, as the regression target.
goal_features = ['murders', 'murdPerPop', 'rapes', 'rapesPerPop', 'robberies', 'robbbPerPop',
                 'assaults', 'assaultPerPop', 'burglaries', 'burglPerPop', 'larcenies', 'larcPerPop',
                 'autoTheft', 'autoTheftPerPop', 'arsons', 'arsonsPerPop', 'violentPerPop', 'nonViolPerPop']
# Columns excluded from the predictors (presumably identifiers / bookkeeping
# fields -- confirm against the dataset description).
non_predictive_features = ['communityname', 'state', 'countyCode', 'communityCode', 'fold']

# Every remaining column is a predictor. A set makes the membership test O(1)
# per column instead of scanning both lists.
excluded = set(goal_features) | set(non_predictive_features)
features = [x for x in df.columns if x not in excluded]
len(features)
Out[35]:
In [36]:
def drop_rows_with_null_goal_feature(old_df, feature):
    """Return the rows of ``old_df`` whose ``feature`` column is not null.

    Parameters
    ----------
    old_df : pandas.DataFrame
        Source frame; it is not modified.
    feature : str
        Name of the column that must be non-null for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame holding only the rows where ``feature`` has
        a value. The original index labels of the kept rows are preserved.

    Raises
    ------
    KeyError
        If ``feature`` is not a column of ``old_df``.
    """
    # The explicit copy detaches the result from `old_df`, so callers can
    # assign into it without pandas raising SettingWithCopyWarning.
    return old_df.dropna(subset=[feature]).copy()
In [37]:
# Fixed seed for the train/test split so that re-running the notebook scores
# every model on the same held-out rows.
SPLIT_SEED = 0

# r2_scores[i][j] is the held-out R^2 of models[j] for goal_features[i].
r2_scores = []
# One x position per model; labelled with the model class names after the loop.
xxx = list(range(1, len(models) + 1))

for goal_feature in goal_features:
    # Rows without a target value cannot be used for fitting or scoring.
    goal_df = drop_rows_with_null_goal_feature(df, goal_feature)
    # Convert the target into its own Series instead of assigning back into
    # `goal_df`, which would be a write into a filtered frame.
    goal_y = pd.to_numeric(goal_df[goal_feature])

    df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
        goal_df[features], goal_y, test_size=0.2, random_state=SPLIT_SEED)

    # Fit the imputer on the training rows only, so that no statistic computed
    # from the test rows leaks into the data the models are trained on; the
    # same training means are then applied to both partitions.
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imr = imr.fit(df_X_train)
    df_X_train = imr.transform(df_X_train)
    df_X_test = imr.transform(df_X_test)

    # The estimator objects are reused across targets; each call to fit()
    # retrains the model on the current target.
    goal_scores = []
    for model in models:
        model.fit(df_X_train, df_y_train)
        goal_scores.append(r2_score(df_y_test, model.predict(df_X_test)))
    r2_scores.append(goal_scores)

    # One line per target, one point per model.
    plt.plot(xxx, goal_scores, label=goal_feature)

plt.xticks(xxx, models_desc, rotation='vertical')
plt.ylabel('R^2 on held-out rows')
plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.6),
           ncol=3, fancybox=True, shadow=True);
In [38]:
# Locate BayesianRidge by name rather than by a hard-coded position, so the
# right column is still selected if the `models` list is reordered or edited.
# Raises ValueError if no BayesianRidge estimator is in `models_desc`.
bayesian_idx = models_desc.index('BayesianRidge')

# One BayesianRidge R^2 per target, in the same order as `goal_features`.
bayesian_score = [scores[bayesian_idx] for scores in r2_scores]
arr = np.array(bayesian_score)
# Target indices sorted from the lowest to the highest score.
order = arr.argsort()
In [39]:
# Report each target's BayesianRidge R^2 as a percentage, lowest score first.
# The single-argument, parenthesised print form runs on both Python 2 and 3.
print("BayesianRidge Predictability:\n\n")
for i in order:
    # Scale first and round afterwards: multiplying an already rounded value
    # by 100 reintroduces float noise (e.g. 56.699999999999996).
    print("{} {}".format(goal_features[i], round(100 * bayesian_score[i], 1)))